home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- *
- * $Log: irfiles.h,v $
- * Revision 1.2 1993/06/01 14:05:54 pfeifer
- * Added code for soundex/phonix indexing and retrieval
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.19 92/04/16 20:04:44 morris
- * small fix to dictionary_block_word_occurances, length read was
- * NEXT_INDEX_BLOCK_SIZE, now it's NUMBER_OF_OCCURANCES_SIZE.
- *
- * Revision 1.18 92/03/19 09:34:08 morris
- * fixed the dictionary header to accurately indicate the number of blocks
- *
- * Revision 1.17 92/02/17 12:38:00 jonathan
- * Added defines for catalog.
- *
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- /* include file for irfiles.c */
-
- #ifndef IRFILES_H
- #define IRFILES_H
-
- #include "cdialect.h"
- #include "cutil.h"
- #include "hash.h"
- #include "ustubs.h" /* for time_t */
- #include "synonym.h"
-
- /* Filename extensions for the various on-disk components of a database.
-    NOTE(review): presumably appended to database_file by the *_filename()
-    functions declared later in this header -- confirm in irfiles.c. */
- #define dictionary_ext ".dct"
- #define filename_table_ext ".fn"
- #define headline_table_ext ".hl"
- #define document_table_ext ".doc"
- #define index_ext ".inv"
- #define source_ext ".src"
- #define catalog_ext ".cat"
- #define synonym_ext ".syn"
- #ifdef BIO
- #define delimiters_ext ".dlm" /* dgg; only defined when BIO is defined */
- #endif
-
- /* These dictionary definitions are used in irhash, irverify, and irfiles. */
- #define DICTIONARY_HEADER_SIZE 4 /* NOTE(review): presumably a size in bytes -- confirm */
- #define DICTIONARY_BLOCK_SIZE 1000L /* in entries, not bytes */
- #define DICTIONARY_ENTRY_HASH_CODE_SIZE 2
- /* #define DICTIONARY_ENTRY_COUNT_SIZE 3 moved to inverted file */
- /* #define DICTIONARY_ENTRY_INDEX_BLOCK_SIZE 4 not used and too long a symbol*/
- /* #define DICTIONARY_ELEMENT_SIZE 6 was 9 */
- #define DICTIONARY_SIZE 524288L /* 2^19; NOTE(review): the unit (entries or bytes) is not stated in this file */
- #define DICTIONARY_TOTAL_SIZE_WORD "{}" /* the word that holds the total number of words in the whole dictionary */
-
- #define INDEX_HEADER_SIZE 4
- #ifdef BIO
-
- /* !! Bug in W8B5 -- Increasing this INDEX_BLOCK_SIZE_SIZE above 2 now fails !! */
- /* it worked in W8B3 ... both 3 and 4 fail now */
- #define INDEX_BLOCK_SIZE_SIZE 2 /* was 2, genbank wants 3, dgg */
-
- #else /* same value as the BIO branch above, so BIO currently changes nothing here */
- #define INDEX_BLOCK_SIZE_SIZE 2
- #endif
-
- #define NEXT_INDEX_BLOCK_SIZE 4
- #define INDEX_BLOCK_FLAG_SIZE 1
-
- /* dgg -- this is a bug
- #define INDEX_BLOCK_HEADER_SIZE 7
- This == 7 only if the component SIZEs don't change
- */
- #define INDEX_BLOCK_HEADER_SIZE (INDEX_BLOCK_SIZE_SIZE+NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_FLAG_SIZE) /* 2 + 4 + 1 = 7 with the values above */
-
-
- #define NUMBER_OF_OCCURANCES_SIZE 4
- #define INDEX_BLOCK_NOT_FULL_FLAG 101 /* NOTE(review): the three *_FLAG values are presumably what is stored in the INDEX_BLOCK_FLAG_SIZE-wide flag field -- confirm in irfiles.c */
- #define INDEX_BLOCK_FULL_FLAG 69
- #define INDEX_BLOCK_DICTIONARY_FLAG 123
-
- #define DOCUMENT_ID_SIZE 4
- #define WORD_POSITION_SIZE 0
- #define CHARACTER_POSITION_SIZE 3
- #define WEIGHT_SIZE 1
- #define INDEX_ELEMENT_SIZE 8 /* hard-coded literal; it equals DOCUMENT_ID_SIZE +
-    WORD_POSITION_SIZE + CHARACTER_POSITION_SIZE + WEIGHT_SIZE (4+0+3+1) with
-    the current values. NOTE(review): presumably meant to be that sum, in
-    which case it goes stale if a component size changes (the same problem
-    dgg describes for INDEX_BLOCK_HEADER_SIZE) -- confirm before changing. */
- #define WORD_ID_SIZE 4 /* for posting arrays */
-
- #ifdef BOOLEANS /* dgg */
- #define BOOLEAN_AND "and" /* may prefer "&", but need symbol fix */
- #define BOOLEAN_NOT "not" /* may prefer "!", but need symbol fix */
- #define BOOLEAN_NOT_FLAG -91 /* stick in weight param as flag for search_word.
-    NOTE(review): expands to an unparenthesized negative literal; harmless
-    next to binary operators, but (-91) would be the defensive spelling. */
- #endif
-
- #ifdef PARTIALWORD /* dgg */
- #define PARTWORD_WILDCARD '*'
- #endif
-
- #ifdef LITERAL /* dgg */
- #define LITERAL_KEY1 '"'
- #define LITERAL_KEY2 0x27 /* single quote ' dgg */
- #define LITERAL_FLAG -92 /* stick in weight param as flag for search_word.
-    NOTE(review): unparenthesized negative literal, as with the -91 flag
-    defined under BOOLEANS. */
- #define MAX_PHRASE_LENGTH 200 /* NOTE(review): unit (characters or words) is not stated in this file */
- #endif
-
- #ifdef SOUND
- #define SOUNDEX "soundex"
- #define PHONIX "phonix"
- #endif
-
- typedef struct database { /* Per-database state: a file name, one stdio stream
-    per component file, and counters used while building an index.
-    Instances are passed as `database*` to almost every function declared
-    in this header. */
- char* database_file; /* NOTE(review): presumably the base path that the *_ext extensions are added to -- confirm */
- FILE* dictionary_stream;
- FILE* filename_table_stream;
- FILE* headline_table_stream;
- FILE* document_table_stream;
- FILE* index_stream;
- #ifdef BIO
- FILE* delimiters_stream; /* member exists only when BIO is defined, so the struct layout differs between BIO and non-BIO builds */
- #endif
- long doc_table_allocated_entries;
- hashtable* the_word_memory_hashtable; /* hashtable type is not declared in this file; presumably from hash.h */
-
- long number_of_words_in_hashtable; /* for building.
- checked on every add_word.
- set at start of building,
- and on every flush.*/
- long flush_after_n_words; /* set at the start of building; used
- to compare with
- number_of_words_in_hashtable. */
- long number_of_words; /* for building. number of different words.
- Set from the headers of .inv files
- as they are merged.
- It is used to set the header when a .inv
- file is first created (not by merging).
- */
- long index_file_number; /* for building. */
- long total_word_count; /* Total number of word occurrences.
- set during indexing, saved in
- dictionary under 'ALL' entry.
- NOTE(review): this file defines DICTIONARY_TOTAL_SIZE_WORD as "{}",
- so the 'ALL' name above may be stale -- confirm in irfiles.c. */
- void* ext_database; /* opaque pointer; its use is not visible in this header */
- t_Synonym* syn_Table; /* synonym index lookup table */
- int syn_Table_Size; /* number of entries in synonym table */
- } database;
-
- typedef struct document_table_entry { /* One document-table record; this is the
-    type handed to read_document_table_entry() and
-    write_document_table_entry(), both declared later in this header. */
- long filename_id; /* NOTE(review): presumably a position usable with read_filename_table_entry() -- confirm */
- long headline_id; /* NOTE(review): presumably a position usable with read_headline_table_entry() -- confirm */
- long source_id; /* for signature system */
- long start_character; /* NOTE(review): start/end are presumably character offsets within the source file -- confirm */
- long end_character;
- long document_length; /* in characters */
- long number_of_lines; /* in lines */
- time_t date; /* 0 if unknown */
- } document_table_entry;
-
- #ifdef __cplusplus
- /* declare these as C style functions */
- extern "C"
- {
- #endif /* def __cplusplus */
-
- database* openDatabase _AP((char* name, boolean initialize,boolean for_search)); /* Database lifecycle functions.
-    _AP(( ... )) wraps every parameter list in this header; the macro is
-    not defined in this file. NOTE(review): presumably it comes from
-    cdialect.h and hides prototypes from pre-ANSI compilers -- confirm.
-    NOTE(review): failure behaviour (e.g. a NULL return) and the meaning of
-    the two boolean flags are not visible here -- see irfiles.c. */
- void closeDatabase _AP((database* the_db));
- void disposeDatabase _AP((database* the_db)); /* NOTE(review): how dispose differs from close is not visible in this header */
-
- void initialize_index_files _AP((database* db));
-
- char *read_filename_table_entry _AP((long position,
- char* filename,
- char* type,
- time_t* file_write_date,
- database* db)); /* Filename-table and headline-table access.
-    NOTE(review): the pointer arguments of the read function are presumably
-    caller-supplied output buffers, and the long returned by the write
-    functions is presumably the position later passed to the matching read
-    function -- confirm in irfiles.c; buffer sizes are not stated here. */
-
- long write_filename_table_entry _AP((char* filename, char *type, database* db));
- boolean filename_in_database _AP((char *filename, char *type,
- time_t *write_file_date, database *db));
- boolean filename_in_filename_file _AP ((char *filename, char*type,
- time_t *file_write_date,
- char* filename_file)); /* takes the table's file name instead of a database* */
- char *read_headline_table_entry _AP((long position,database* db));
- long write_headline_table_entry _AP((char* headline, database* db));
-
- #ifdef BIO /* delimiter accessors are declared only when BIO is defined, matching delimiters_ext and database.delimiters_stream */
- char *read_delimiters _AP((database* db));
- long write_delimiters _AP((char* delimiters, database* db));
- #endif
-
- boolean read_document_table_entry
- _AP((document_table_entry* doc_entry,long number,database* db)); /* Document-table access.
-    NOTE(review): doc_entry is presumably filled in for document `number`,
-    with the boolean reporting success -- confirm in irfiles.c. */
-
- long write_document_table_entry
- _AP((document_table_entry* doc_table_entry, database* db)); /* NOTE(review): meaning of the long result is not visible here */
-
- boolean writeUserValToDocIDTable _AP((unsigned long userVal,long doc,
- database* db));
-
-
- long next_document_id _AP((database* db));
-
-
- void close_dictionary_file _AP((database *db)); /* Dictionary file functions.
-    NOTE(review): the long results below are presumably status codes or
-    index-block positions; which one is not visible in this header --
-    confirm in irfiles.c before relying on either. */
-
- long add_word_to_dictionary
- _AP((char *word, long index_file_block_number, long number_of_occurances,
- database* db));
- #ifdef PARTIALWORD
- long look_up_partialword_in_dictionary _AP((char *word, long *word_id, database* db)); /* declared only when PARTIALWORD is defined; same signature as look_up_word_in_dictionary */
- #endif
- long look_up_word_in_dictionary _AP((char *word, long *word_id, database* db));
- long init_dict_file_for_writing _AP((database *db));
- void init_dict_file_detailed _AP((FILE* dictionary_stream,
- long number_of_blocks));
- void record_num_blocks_in_dict _AP((FILE* dictionary_stream,
- long number_of_words)); /* NOTE(review): the function name says blocks but the parameter is number_of_words -- confirm which unit callers pass */
-
- long finished_add_word_to_dictionary _AP((database *db));
-
- boolean register_src_structure _AP((char *filename)); /* Source-description (.src) and catalog functions.
-    NOTE(review): what "register" does with filename is not visible in this
-    header -- see irfiles.c. */
- boolean write_src_structure _AP((char *filename,
- char *database_name,
- char *type_name, /* formerly spelled `typename`, which is a reserved word in C++; this header wraps its declarations in extern "C" for C++ callers, so the old name could not be parsed there. Prototype parameter names are not part of the C interface, so callers and the definition are unaffected. */
- char **filenames,
- long number_of_filename, /* NOTE(review): presumably the element count of filenames -- confirm */
- boolean export_database,
- long tcp_port));
-
- boolean build_catalog _AP((database* db));
-
- long allocate_index_block _AP((long how_large, FILE* stream)); /* NOTE(review): presumably returns the position of the new block in stream -- confirm in irfiles.c */
-
- unsigned char *read_dictionary_block _AP((unsigned char* block,
- long position,long length,
- FILE* stream)); /* NOTE(review): block is presumably a caller-supplied buffer; whether length is in bytes or entries is not stated here -- confirm */
-
- void print_dictionary _AP((database* db));
-
- #define DICTIONARY_ENTRY_SIZE 29 /* sum of MAX_WORD_LENGTH, 1 ('\0'),
- NEXT_INDEX_BLOCK_SIZE and
- NUMBER_OF_OCCURANCES_SIZE.
- Hard-coded: with NEXT_INDEX_BLOCK_SIZE 4 and NUMBER_OF_OCCURANCES_SIZE 4
- as defined in this file, 29 implies MAX_WORD_LENGTH is 20.
- NOTE(review): MAX_WORD_LENGTH is not defined in this file; this literal
- goes stale if any of the component sizes change -- confirm. */
-
-
- #ifdef DICT_FUNC /* accessors as real functions when DICT_FUNC is defined */
-
- char *dictionary_block_word _AP((long i,unsigned char* block));
- long dictionary_block_position _AP((long i,unsigned char* block));
- long dictionary_block_word_occurances _AP((long i,unsigned char* block));
-
- #else /* macros:
-    each addresses entry i of an in-memory dictionary block as
-    block + i * DICTIONARY_ENTRY_SIZE. The word sits at offset 0, the
-    position field at offset MAX_WORD_LENGTH + 1, and the occurrence count
-    NEXT_INDEX_BLOCK_SIZE bytes after that. Each macro evaluates i and
-    block exactly once. read_bytes_from_memory and MAX_WORD_LENGTH are not
-    declared in this file and must be visible wherever these are expanded. */
-
- #define dictionary_block_word(i,block) \
- ((char *)((block) + ((i) * DICTIONARY_ENTRY_SIZE)))
-
- #define dictionary_block_position(i,block) \
- read_bytes_from_memory(NEXT_INDEX_BLOCK_SIZE, \
- (block) + ((i) * DICTIONARY_ENTRY_SIZE) + \
- MAX_WORD_LENGTH + 1)
-
- #define dictionary_block_word_occurances(i,block) \
- read_bytes_from_memory(NUMBER_OF_OCCURANCES_SIZE, \
- (block) + ((i) * DICTIONARY_ENTRY_SIZE) + \
- MAX_WORD_LENGTH + 1 + NEXT_INDEX_BLOCK_SIZE)
- #endif
-
- void print_dictionary_block _AP((unsigned char* block,long size)); /* NOTE(review): whether size is in entries or bytes is not stated here */
-
- /* database functions: one per component file. Each takes a destination
-    buffer and the database and returns char*.
-    NOTE(review): presumably each writes the component's file name
-    (database_file plus the matching *_ext) into destination and returns
-    destination -- confirm in irfiles.c; the required buffer size is not
-    stated in this header. */
- char* dictionary_filename _AP((char* destination, database* db));
- char* filename_table_filename _AP((char* destination, database* db));
- char* headline_table_filename _AP((char* destination, database* db));
- char* document_table_filename _AP((char* destination, database* db));
- char* index_filename _AP((char* destination, database* db));
- char* index_filename_with_version _AP((long version, char* destination,
- database* db));
- char* source_filename _AP((char* destination, database* db));
- #ifdef BIO
- char* delimiters_filename _AP((char* destination, database* db)); /* declared only when BIO is defined */
- #endif
-
- #ifdef __cplusplus
- }
- #endif /* def __cplusplus */
-
- #endif /* IRFILES_H */
-